 /*
Created by : Clemence, August 2018

This Dofile : 
			- Focuses on Export data, generate datasets to compare the different sources and make descriptive statistics 

Issues : Different Data sources that are supposed to be equivalent

	1 - Classic customs flow 2 and 4 :  1995-2010
	2 - flow 2  with currency : 2011 - 2017
	3 - Flow with buyer id : 1995 - 2017

Test to be done :

	- Check the consistency of flow 4 between the classic custom flow source and the flows with buyer ID
	- Check the evolution of value traded/nb of siren between 2010 and 2011 (new source)
	
What is done :

		1) Keep only valid siren
		2) Keep only valid country code
		3) Keep only valif flows (no flow should be export=0 & kgs=0)
		4) Only valid nc8 code
		5) Only valid buyer number
Some remaining problem :
		- For flows 1 & 2 after 2011 : there observations with valstat==0 & kgs==0, i don't understand why, no special patterns in terms of year or iso2
		- Similar obseravtion euros==0 & kgs==0 in buyers data
Output :

		- exp_by_year, exhaustive exports flows 2 and 4 from 1995 to 2017, data are aggregated at the siren*iso2*year level. 
		  Data are exhaustive because they include under threshold observations withut products
		- exp_by_nc8year, exports flows 2 and 4 from 1995 to 2017 siren*iso2*year*nc8 level.
		- Excel file: SummaryStatistics_24
		  
To be deleted later 
		
		- Clean Data from classic mensuel : cm_by_year.dta,  cm_by_nc8year
		- Clean data from currency data : cur_by_year and cur_by_nc8year
		- Firm to firm data : buyer_by_year, buyer_by_nc8year and buyer_by_buyernc8year

*/

set more off, perma
cd $sourcepath	
do "$dopath\id_group.do"
	
*1 Source : Classic customs flow	
use flux13_cm.dta, clear

* Variables an   mois     flux  siren   nc8    cpa6    pyod   payp  euros  indicmasse  kgs unites   year
drop payp // no obs only for imports
rename pyod iso2
rename euros import

*1.1 Keep Valid siren numbers
tostring siren, replace
replace siren="0"+siren if length(siren)==8 
replace siren="00"+siren if length(siren)==7


#delimit ; 
 drop if length(siren)<=5 ;
drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999";
#delimit cr


*1.2 Keep valid pyod


/*
*Codes QQ,QU,QV,QW are anonymized for defense reason ==> dropped
*Codes TP XA XC XE XF XG XH XI XK XL XM XO XP XR XS  are user-defined codes ==> dropped"
*https://www.iso.org/fr/glossary-for-iso-3166.html
Codes attribués par les utilisateurs - Si des utilisateurs ont besoin de codes pour représenter des noms de pays qui ne font pas partie 
de la présente partie de l'ISO 3166, les séries de lettres AA, QM à QZ, XA à XZ et ZZ et les séries AAA à AAZ, QMA à QZZ, XAA à XZZ et ZZA 
à ZZZ et la série des nombres 900 à 999 peuvent être employées.
* XU = Belgiuem and Luxembourg until 99
* YU Yougoslavia until 2003
* CS : Serbia Montenegra 2004-2005
* TP : Portuguese Timor in 2001-2002
* AN : Netherland Antilles from 95- 2010
*/
drop drop
replace iso2="BE" if iso2=="XU" & year<1999
gen drop=1 if  inlist(iso2,"QQ","QU","QV","QW","EU","FR")
replace drop=1 if inlist(iso2,"XA","XC","XE","XF","XG","XU")
replace drop=1 if inlist(iso2,"XH","XI","XK","XL","XM","XN")
replace drop=1 if inlist(iso2,"XO","XP","XR","XS","XV","XZ")

drop if drop==1
compress
save clean_flux_13.dta, replace

*Aggregate at the siren*iso2*year level
collapse (sum) import kgs unites, by(siren iso2 year flux)
sort siren iso2 year
gen source="Classic Mensuel"
compress
save cm_by_year.dta, replace


*1.3 Clean Products
use clean_flux_13, clear
tostring nc8, replace
replace nc8="0"+nc8 if length(nc8)==7
replace nc8="00"+nc8 if length(nc8)==6
replace nc8="000"+nc8 if length(nc8)==5
replace nc8="0000"+nc8 if length(nc8)==4
replace nc8="00000"+nc8 if length(nc8)==3
replace nc8="000000"+nc8 if length(nc8)==2
replace nc8="0000000"+nc8 if length(nc8)==1
g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" )
g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000")

drop if nc8part == 1
drop if nc8group==1
drop nc8group nc8part 
compress
keep import nc8 iso2 siren kgs unites mois year flux
gen source="cm"
save clean_flux_13.dta, replace

*Aggregate at the siren*iso2*year*nc8 level
collapse (sum) import kgs unites, by(siren iso2 year nc8 flux)
sort siren iso2 year
gen source="Classic Mensuel"
save cm_by_nc8year.dta, replace



*2 Source : Currency customs flows 1
cd $sourcepath
use x_cur.dta, clear
keep if flux==1

rename pyod iso2
rename euros import

*2.1 Keep Valid siren numbers
tostring siren, replace
replace siren="0"+siren if length(siren)==8 
replace siren="00"+siren if length(siren)==7



#delimit ; 
 drop if length(siren)<=5 ;
drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999";
#delimit cr

*2.2 Keep valid pyod
/*
Many EU code as destination country :
	- can stand for US "Etats Unis"
	- codes used when people don't know what to put
	- code used for re-exported goods from switzerland
FR code as the origin of Imports can happen when
	a) French goods are imported from another cpountry and the re-exported from this country to France. 
       Ex: A Belgian  wine delaer imports champagn and then sale part of it in France 

	b) Defective exported  goods taht are re-imported, temporary exports
	c) Misreporting
Decison : Drop thiese flows
FR code as the destination country
	Could be related to exports from DOM-TOM 
	==> dropped
*/
gen drop=1 if  inlist(iso2,"QP","QU","QV","QW","FR")
replace drop=1 if inlist(iso2,"XC","XK","XL","XS")
replace drop=1 if iso2=="EU"



drop if drop==1
drop drop

compress
save clean_m_cur.dta, replace


*Aggregate at the siren*iso2*year level
collapse (sum) import kgs unites, by(siren iso2 year flux)
gen source="Currency"
sort siren iso2 year
append using cm_by_year
save imp_by_year.dta, replace
erase cm_by_year.dta



*2.3 Clean Product
use clean_m_cur, clear
tostring nc8, replace
replace nc8="0"+nc8 if length(nc8)==7
replace nc8="00"+nc8 if length(nc8)==6
replace nc8="000"+nc8 if length(nc8)==5
replace nc8="0000"+nc8 if length(nc8)==4
replace nc8="00000"+nc8 if length(nc8)==3
replace nc8="000000"+nc8 if length(nc8)==2
replace nc8="0000000"+nc8 if length(nc8)==1
g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" )
g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000")
drop if nc8part == 1
drop if nc8group==1
drop nc8group nc8part 
save clean_m_cur, replace

*Aggregate at the siren*iso2*year*nc8 level
collapse (sum) import kgs unites, by(siren iso2 year nc8 flux)
gen soure="Currency"
sort siren iso2 year
append using cm_by_nc8year
save imp_by_nc8year.dta, replace
erase cm_by_nc8year.dta

use clean_m_cur, clear
gen source="currency"
keep import nc8 iso2 siren kgs unites mois year flux source
append using clean_flux_13.dta
save clean_flux_13.dta, replace
erase clean_m_cur.dta


*3 Source : Buyer custom flows
use m_deb.dta, clear


rename pyod iso2
rename euros import

*3.1 Keep Valid siren numbers
tostring siren, replace
replace siren="0"+siren if length(siren)==8 
replace siren="00"+siren if length(siren)==7


#delimit ; 
 drop if length(siren)<=5 ;
drop if siren=="000000000"|siren=="111111111"|siren=="222222222" |siren=="202020202"
|siren=="777777777"|siren=="888888888"|siren=="999999999";
#delimit cr

*3.3 Keep relevant regim
set more off
keep if regdem==11 | regdem==19



*3.4 : remove flows with export=0 and kgs=0

drop if import==0 & kgs==0



preserve 
*Aggregate at the siren*iso2*year level
collapse (sum) import kgs unites , by(siren iso2 year flux)
gen source="import DEB"
sort siren iso2 year
compress
save mdeb_by_year.dta, replace
append using imp_by_year.dta
save imp_by_year.dta, replace
restore



*3.5 Clean CN8 products
* We want to keep only valid NC8 codes
* NC8 99500000 is a code used for small transactions and "envois groupés" 
* NC8 9880XX00 - XX being 73, 84, 99 or 85 - is a code used for large manufacturing or commercial firms which import values above 3 million euros and benefit from a simplified code grouping all products together
* L'arrêté du 26 décembre 2000 (modifié) fixe les conditions d'utilisation de la nomenclature spécifique des produits 
* 99050000 : personnal belongings of moving individuals
* 99190000 : belongings of catastrophy victims (coffin...) 
* 9930 : Refuelling
* 9931 : goods for offsghores installations/infrastructures
* Seek for more informations in the note. 
* In flux 13, non-valid cn8 account for about 0.5 percent of transactions, 97.31 of which are "99500000". 97.49% of non-valid cn8 are in flux 3 : no 99500000
tostring nc8, replace
replace nc8="0"+nc8 if length(nc8)==7
replace nc8="00"+nc8 if length(nc8)==6
replace nc8="000"+nc8 if length(nc8)==5
replace nc8="0000"+nc8 if length(nc8)==4
replace nc8="00000"+nc8 if length(nc8)==3
replace nc8="000000"+nc8 if length(nc8)==2
replace nc8="0000000"+nc8 if length(nc8)==1
g nc8group=(nc8=="99500000" | nc8=="98807300" | nc8=="98808400" | nc8=="98809900" | nc8=="98808500" )
g nc8part = (nc8=="99699999" |nc8=="99050000" |nc8=="99190000" |nc8=="99300000"|nc8=="99310000")

drop if nc8part == 1
drop if nc8group==1
drop nc8group nc8part 


preserve 
*Aggregate at the siren*iso2*nc8*year level
collapse (sum) import kgs unites , by(siren iso2 year flux nc8)
gen source="import DEB"
sort siren iso2 year
save mdeb_by_nc8year.dta, replace
append using imp_by_nc8year.dta
save imp_by_nc8year.dta, replace
restore

 use imp_by_year.dta, clear
 collapse (sum) import kgs unites, by(siren iso2 year)
sort siren iso2 year
compress
save imp_by_sireniso2year_19932017.dta, replace

use imp_by_nc8year.dta, clear
 collapse (sum) import kgs unites, by(siren iso2 year nc8)
sort siren iso2 year
compress
save imp_by_sirennc8iso2year_19932017.dta, replace


erase mdeb_by_nc8year.dta
erase mdeb_by_year.dta
erase flux13_cm.dta
erase imp_by_year.dta
erase imp_by_nc8year.dta
erase m_deb.dta


